Today’s Goal

Prep

Load packages

if (!require(ggwaffle)) {
  remotes::install_github("liamgilbey/ggwaffle")
}
pacman::p_load(tidyverse,
               gapminder,
               gcookbook,
               ggwaffle,
               ggcharts,
               ggridges,
               GGally,
               emojifont)

Helper functions

Source: http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/#Helper%20functions

summarySE <- function(data = NULL, 
                      measurevar, 
                      groupvars = NULL, 
                      na.rm = FALSE,
                      conf.interval = .95, 
                      .drop = TRUE) {
    library(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm = FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This does the summary. For each group's data frame, return a vector with
    # N, mean, and sd
    datac <- ddply(data, groupvars, .drop = .drop,
      .fun = function(xx, col) {
        c(N    = length2(xx[[col]], na.rm = na.rm),
          mean = mean   (xx[[col]], na.rm = na.rm),
          sd   = sd     (xx[[col]], na.rm = na.rm)
        )
      },
      measurevar
    )

    # Rename the "mean" column    
    datac <- rename(datac, c("mean" = measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}

Comparing Categories

Bar Charts

A simple bar chart.

data(mpg)

bar1 <- ggplot(mpg, aes(x = manufacturer)) +
  geom_bar()
plot(bar1)

Rotate the plot.

bar2 <- ggplot(mpg, aes(y = manufacturer)) +
  geom_bar()
plot(bar2)

Alternatively, you can do:

bar2b <- bar1 + 
  coord_flip()
plot(bar2b)

Reverse the order of the category.

bar3 <- ggplot(mpg, aes(y = reorder(manufacturer, desc(manufacturer)))) +
  geom_bar() +
  labs(y = "Manufacturer")
plot(bar3)

Proportion

bar4 <- ggplot(mpg, 
               aes(x = (after_stat(count) / sum(after_stat(count))),
                   y = reorder(manufacturer, desc(manufacturer)))) +
  geom_bar() +
  labs(y = "Manufacturer",
       x = "Proportion")
plot(bar4)

Paired Bar Charts

data(Titanic)
Titanic <- as_tibble(Titanic)

pbar1 <- ggplot(Titanic, aes(x = Class, y = n, fill = Sex)) +
  geom_bar(stat = "identity", 
           position = "dodge") +
  scale_fill_brewer(palette = "Set2")
plot(pbar1)

pbar2 <- ggplot(Titanic, aes(x = Sex, y = n, fill = Class)) +
  geom_bar(stat = "identity",
           position = "dodge") +
  scale_fill_brewer(palette = "Accent")
plot(pbar2)                

Stacked Bar

sbar1 <- ggplot(Titanic, aes(x = Class, y = n, fill = Sex)) +
  geom_bar(stat = "identity", 
           position = "stack") +
  scale_fill_brewer(palette = "Set2")
plot(sbar1)

sbar2 <- ggplot(Titanic, aes(x = Sex, y = n, fill = Class)) +
  geom_bar(stat = "identity",
           position = "stack") +
  scale_fill_brewer(palette = "Accent")
plot(sbar2)                

sbar3 <- ggplot(Titanic, aes(x = Sex, y = n, fill = Class)) +
  geom_bar(stat = "identity",
           position = "fill") +
  scale_fill_brewer(palette = "Accent")
plot(sbar3)                

Diverging Bars

gapminder_sub <- gapminder |> 
  filter(year %in% c(1997, 2007)) |> 
  filter(continent == "Asia") |> 
  select(country, year, gdpPercap) |> 
  pivot_wider(id_cols = country,
              names_from = year,
              names_prefix = "year_",
              values_from = gdpPercap) |> 
  mutate(gdp_change = (year_2007 - year_1997) / year_1997)
  

dbar1 <- ggplot(gapminder_sub, aes(y = country, x = gdp_change)) +
  geom_bar(stat = "identity") +
  labs(x = "GDP change", y = "Country")
plot(dbar1)

dbar2 <- ggplot(gapminder_sub, aes(y = country, x = gdp_change,
                                   fill = gdp_change > 0)) +
  geom_bar(stat = "identity") +
  labs(x = "GDP change", y = "Country") +
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "none")
plot(dbar2)

dbar3 <- ggplot(gapminder_sub, aes(y = reorder(country, gdp_change),
                                   x = gdp_change,
                                   fill = gdp_change > 0)) +
  geom_bar(stat = "identity") +
  labs(x = "GDP change", y = "Country") +
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "none")
plot(dbar3)

data(starwars)
starwars_chars <- starwars |> 
  filter(!is.na(homeworld)) |> 
  group_by(homeworld, gender) |> 
  summarize(average_height = mean(height, na.rm = TRUE),
            .groups = "drop") |> 
  group_by(homeworld) |> 
  filter(n() == 2) |> 
  mutate(average_height = ifelse(gender == "feminine",
                                 average_height,
                                 -1 * average_height))

dbar4 <- starwars_chars |> 
  ggplot(aes(y = homeworld, x = average_height, fill = gender)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("dodgerblue", "tomato")) +
  scale_x_continuous(breaks = seq(-200, 200, by = 100),
                     labels = c(200, 100, 0, 100, 200)) +
  labs(x = "Average height",
       y = "Homeworld")
plot(dbar4)

Dot Plot

tophit <- tophitters2001[1:25, ] # Take the top 25 from the tophitters data set

dplot1 <- ggplot(tophit, aes(x = avg, y = name)) +
  geom_point()
plot(dplot1)

dplot2 <- ggplot(tophit, aes(x = avg, y = reorder(name, avg))) +
  geom_point(size = 3) +  
  theme_bw() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_line(color = "grey60", 
                                          linetype = "dashed")) +
  labs(x = "Average", y = "Name")
plot(dplot2)

dplot3 <- ggplot(tophit, aes(x = reorder(name, avg), y = avg)) +
  geom_point(size = 3) +  # Use a larger dot
  theme_bw() +
  theme(panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.x = element_line(color = "grey60", 
                                          linetype = "dashed"),
        axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(x = "Name", y = "Average")
plot(dplot3)

dplot4 <- ggplot(tophit, aes(x = avg, y = reorder(name, avg))) +
  geom_segment(aes(yend = name), xend = 0, color = "grey50") +
  geom_point(size = 3, aes(colour = lg)) +
  scale_color_brewer(palette = "Set1", 
                     name = "League",
                     limits = c("NL", "AL")) +
  theme_bw() +
  theme(panel.grid.major.y = element_blank(), 
        legend.position = "bottom") +
   labs(x = "Average", y = "Name")
plot(dplot4)

dplot5 <- ggplot(tophit, aes(x = avg, y = reorder(name, avg))) +
  geom_segment(aes(yend = name), xend = 0, color = "grey50") +
  geom_point(size = 3, aes(colour = lg)) +
  scale_colour_brewer(palette = "Set1") +
  theme_bw() +
  theme(panel.grid.major.y = element_blank(),
        legend.position = "none") +
  facet_grid(rows = vars(lg), 
             scales = "free_y", 
             space = "free_y") +
   labs(x = "Average", y = "Name")
plot(dplot5)

Mosaic Plot

download.file(
  url = "https://github.com/yukiyanai/quant-methods-R/raw/master/data_fixed/hr96-17.csv",
  dest = "data/hr96-17.csv")

Or download the data set manually.

HR09 <- read_csv("data/hr96-17.csv") |> 
  filter(year == 2009) |> 
  mutate(wl = factor(wl, 
                     levels = 0 : 2, 
                     labels = c('lost', 'won', 'zombie')),
           status = factor(status, 
                           levels = 0 : 2, 
                           labels = c('challenger', 'incumbent', 'ex-member')))

# Cross Table
(tbl_st_wl2 <- with(HR09, table(wl, status)))
##         status
## wl       challenger incumbent ex-member
##   lost          558       170        14
##   won            80       175        45
##   zombie         43        52         2
# total num. of candidates by status
tbl_st <- with(HR09, table(status))
# turn the table into a matrix
tbl_st_wl2 <- as.matrix(tbl_st_wl2[1:3, 1:3])
tbl_st <- as.matrix(tbl_st[1:3])
# Create a data frame
# make variables of the share of each status and
# of the share of each result for each status
df <- data.frame(status = levels(HR09$status),
                 status_pct = 100 * as.vector(tbl_st / sum(tbl_st)),
                 win = 100 * as.vector(tbl_st_wl2[2,] / tbl_st),
                 zombie = 100 * as.vector(tbl_st_wl2[3,] / tbl_st),
                 lose = 100 * as.vector(tbl_st_wl2[1,] / tbl_st))
# Calculate the boundaries of categories on the horizontal axis
df <- df |> 
    mutate(xmax = cumsum(status_pct),
           xmin = xmax - status_pct)
## Remove the variable status_pct because we won't use it
df$status_pct <- NULL
# Reshape the data frame
df_long <- df %>% gather(key = result, value = voteshare, win:lose)
##################################################
## COMPARE df_long with df
## to understand what gather() has accomplished
## df
## df_long
###################################################
# Calculate the boundaries of categories on the vertical axis
df_long_1 <- df_long %>%
    group_by(status) %>%
    mutate(ymax = cumsum(voteshare),
           ymin = ymax - voteshare)
# Detemine the locations to put texts
df_long_1 <- df_long_1 %>%
    mutate(xtext = xmin + (xmax - xmin) / 2,
           ytext = ymin + (ymax - ymin) / 2)
# Make a mosaic plot
mosaic <- ggplot(df_long_1, aes(ymin = ymin, ymax = ymax,
                                xmin = xmin, xmax = xmax,
                                fill = result))
mosaic <- mosaic +
    geom_rect(color = I('grey')) +
    geom_text(aes(x = xtext, y = ytext, label = paste0(round(voteshare), '%'))) +
    geom_text(aes(x = xtext, y = 103, label = status)) +
    labs(x = '', y = '')
plot(mosaic)

Waffle Charts

waffle_data <- waffle_iron(mpg, aes_d(group = class))

waffle1 <- ggplot(waffle_data, aes(x, y, fill = group)) + 
  geom_waffle()
plot(waffle1)

waffle2 <- ggplot(waffle_data, aes(x, y, fill = group)) + 
  geom_waffle() + 
  coord_equal() + 
  scale_fill_waffle() + 
  theme_waffle()
plot(waffle2)

waffle3 <- ggplot(waffle_data, aes(x, y, color = group)) + 
  geom_waffle(tile_shape = "circle", size = 4) + 
  coord_equal() + 
  scale_colour_waffle() + 
  theme_waffle()
plot(waffle3)

waffle_data$label = fontawesome("fa-twitter")

waffle4 <- ggplot(waffle_data, aes(x, y, color = group)) + 
  geom_text(aes(label = label), 
            family = "fontawesome-webfont", 
            size = 3) +
  scale_color_brewer(palette = "Set1") +
  coord_equal() + 
  theme_waffle()  
plot(waffle4)

Distribution

Histogram

hist1 <- ggplot(mpg, aes(x = cty)) +
  geom_histogram(color = "black") +
  labs(x = "City miles per gallon")
plot(hist1)

Change the bin width.

hist2 <- ggplot(mpg, aes(x = cty)) +
  geom_histogram(color = "black",
                 binwidth = 5) +
  labs(x = "City miles per gallon")
plot(hist2)

Shift bin boundaries.

hist3 <- ggplot(mpg, aes(x = cty)) +
  geom_histogram(color = "black",
                 binwidth = 2,
                 boundary = 0) +
  labs(x = "City miles per gallon")
plot(hist3)

Pyramid Chart

You can use ggcharts::pyrmid_chart()

data(popch)

pyramid1 <- pyramid_chart(popch, x = age, y = pop, group = sex,
                          bar_colors = c("darkorange", "darkblue"),
                          xlab = "Population") 
## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

## Warning: `expand_scale()` is deprecated; use `expansion()` instead.

## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
plot(pyramid1)

Error Bars

Standard errors.

data(ToothGrowth)

tgc <- summarySE(ToothGrowth, 
                 measurevar = "len", 
                 groupvars = c("supp", "dose"))

tgc2 <- tgc |> 
  mutate(dose = factor(dose))


eb1 <- ggplot(tgc2, aes(x = dose, y = len, fill = supp)) + 
    geom_bar(position = "dodge",
             stat = "identity") +
    geom_errorbar(aes(ymin = len - se, ymax = len + se),
                  width = .2,                    
                  position = position_dodge(.9)) +
  scale_fill_brewer(palette = "Dark2",
                    name = "Supplement type") +
  labs(x = "Dose (mg / day)",
       y = "Tooth length")
plot(eb1)

95% confidence interval.

eb2 <- ggplot(tgc2, aes(x = dose, y = len, fill = supp)) + 
    geom_bar(position = "dodge",
             stat = "identity") +
    geom_errorbar(aes(ymin = len - ci, ymax = len + ci),
                  width = .2,                    
                  position = position_dodge(.9)) +
  scale_fill_brewer(palette = "Dark2",
                    name = "Supplement type") +
  labs(x = "Dose (mg / day)",
       y = "Tooth length")
plot(eb2)

Confidence Interval

95% confidence interval.

ci1 <- ggplot(mtcars, aes(x = mpg, y = wt)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  labs(x = "Miles / US gallon",
       y = "Weight (1000 lbs)")
plot(ci1)

99.9% confidence interval.

ci2 <- ggplot(mtcars, aes(x = mpg, y = wt)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE, level = 0.999) +
  labs(x = "Miles / US gallon",
       y = "Weight (1000 lbs)")
plot(ci2)

Loess smoother with 87% confidence interval.

ci3 <- ggplot(mtcars, aes(x = mpg, y = wt)) +
  geom_point() +
  geom_smooth(method = "loess", se = TRUE, level = 0.87) +
  labs(x = "Miles / US gallon",
       y = "Weight (1000 lbs)")
plot(ci3)

Box-and-whisker Plot

p <- ggplot(mpg, aes(x = class, y = hwy)) 
box1 <- p + 
  geom_boxplot()
plot(box1)

Rotate:

box2 <- ggplot(mpg, aes(y = class, x = hwy)) +
  geom_boxplot()
plot(box2)

Or

plot(box1 + coord_flip())

Width proportion to the square-roots fo the number of observations of each group.

box3 <- p + 
  geom_boxplot(varwidth = TRUE)
plot(box3)

Highlight (potential) outliers.

box4 <- p + 
  geom_boxplot(outlier.color = "red",
               outlier.shape = 1)
plot(box4)

With original data points with jitter.

box5 <- p + 
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2)
plot(box5)

box6 <- p +
  geom_boxplot(aes(fill = drv))
plot(box6)

Violin Chart

violin1 <- p + 
  geom_violin()
plot(violin1)

Rotate:

violin2 <- ggplot(mpg, aes(y = class, x = hwy)) +
  geom_violin()
plot(violin2)

Or

plot(violin1 + coord_flip())

With original data points with jitter.

violin3 <- p + 
  geom_violin() + 
  geom_jitter(width = 0.2)
plot(violin3)

Violin chart with box-and-whisker plots

violin4 <- p +
  geom_violin() +
  geom_boxplot(fill = "gray",
               width = 0.2)
plot(violin4)

Ridgeline Plot

ridge1 <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges()
plot(ridge1)

Closed polygon

ridge2 <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges2()
plot(ridge2)

Control the overlaps: “scale = 1” touches exactly

ridge3 <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges(scale = 1)
plot(ridge3)

Control the overlaps: “scale < 1” makes no overlap

ridge4 <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges(scale = 0.9)
plot(ridge4)

Control the overlaps: “scale > 1” overlaps

ridge5 <- ggplot(iris, aes(x = Sepal.Length, y = Species)) +
  geom_density_ridges(scale = 3)
plot(ridge5)

ridge6 <- ggplot(lincoln_weather, aes(x = `Mean Temperature [F]`, 
                                      y = Month, 
                                      fill = stat(x))) +
  geom_density_ridges_gradient(scale = 3, rel_min_height = 0.01) +
  scale_fill_viridis_c(name = "Temp. [F]", option = "C") +
  labs(title = "Temperatures in Lincoln NE in 2016")
plot(ridge6)

ridge7 <- ggplot(iris, aes(x = Sepal.Length, 
                           y = Species, 
                           fill = factor(stat(quantile))),
                 alpha = 1/5) +
  stat_density_ridges(
    geom = "density_ridges_gradient",
    calc_ecdf = TRUE,
    quantiles = c(0.025, 0.975)
  ) +
  scale_fill_manual(
    name = "Probability", 
    values = c("#FF0000A0", "#A0A0A0A0", "#0000FFA0"),
    labels = c("(0, 0.025]", "(0.025, 0.975]", "(0.975, 1]")
  )
plot(ridge7)

Coefficient Plot

You can use GGally::ggcoef().

fit <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width, 
          data = iris)
coef1 <- ggcoef(fit)
plot(coef1)

Exclude the intercept.

coef2 <- ggcoef(fit,
                exclude_intercept = TRUE)
plot(coef2)

Customize: 99% confidence interval instead of 95%.

coef3 <- ggcoef(fit, 
                exclude_intercept = TRUE,
                conf.level = 0.99,
                color = "royalblue",
                size = 5)
plot(coef3)

Time and Space

Will learn in Lectures 11 and 12.